I came accross the following youtube channel where David Robinson does analyses on different datasets. I saw the following video where he performed an analysis on NYC squirrels and thought I would give it a shot of my own!
The following libraries will be used in this analysis. If it is your first time using these packages, also use the ‘install.packages’ function.
#install.packages("rlang")
library(rlang)
#install.packages("ggmap")
library(ggmap)
#install.packages("gam")
library(gam)
#install.packages("gridExtra")
library(gridExtra)
nyc_squirrels <- readr::read_csv("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2019/2019-10-29/nyc_squirrels.csv")
# Convert to Data Frame
sq_dat <- as.data.frame(nyc_squirrels)
# The following plots are how I new to drop a few of the fields
# Using some quick and very ugly plots, we can quickly see that some of these fields are useless at the moment.
# In the event that the squirrel dataset is added to using sightings from different locations across NYC, this may change
par(mfrow=c(2,3),bg = '#222222')
for(var in c("community_districts","borough_boundaries","city_council_districts","police_precincts","zip_codes")){
plot_dist <- na.omit(unique(sq_dat[,c("lat","long",var)]))
plot(plot_dist$long,plot_dist$lat,col=c("black","blue","red","purple","green")[as.factor(plot_dist[,var])],pch=15,xlab="Longitude",ylab="Latitude",axis="",adj=0,main=var)
}
par(mfrow=c(1,1))
data <- sq_dat[,!(colnames(sq_dat) %in% c(
"above_ground_sighter_measurement",
"combination_of_primary_and_highlight_color",
"unique_squirrel_id",
"lat_long",
"zip_codes",
"community_districts",
"borough_boundaries",
"city_council_districts",
"police_precincts"
))]
# Needed to overlay plots on maps
data[,"lon"] <- data[,"long"]
data[,"adult"] <- ifelse(data[,"age"]=="Adult",TRUE,FALSE)
data[,"above_ground"] <- ifelse(data[,"location"]=="Above Ground",TRUE,FALSE)
data[,"primary_Gray"] <- ifelse(data[,"primary_fur_color"]=="Gray",TRUE,FALSE)
data[,"primary_Cinnamon"] <- ifelse(data[,"primary_fur_color"]=="Cinnamon",TRUE,FALSE)
data[,"primary_Black"] <- ifelse(data[,"primary_fur_color"]=="Black",TRUE,FALSE)
data[,"primary_Gray"] <- ifelse(is.na(data[,"primary_fur_color"]),FALSE,TRUE)
data[,"primary_Cinnamon"] <- ifelse(is.na(data[,"primary_fur_color"]),FALSE,TRUE)
data[,"primary_Black"] <- ifelse(is.na(data[,"primary_fur_color"]),FALSE,TRUE)
data[,"colors"] <- apply(data[,c("primary_fur_color","highlight_fur_color")],1,paste,collapse=",")
data[,"Gray"] <- grepl("Gray",data[,"colors"])
data[,"Cinnamon"] <- grepl("Cinnamon",data[,"colors"])
data[,"Black"] <- grepl("Black",data[,"colors"])
data[,"White"] <- grepl("White",data[,"colors"])
data[,"UniqueColors"] <- apply(data[,c("Gray","Cinnamon","Black","White")],1,sum)
data[,"MultiColor"] <- ifelse(data[,"UniqueColors"]>1,TRUE,ifelse(data[,"UniqueColors"]==0,NA,FALSE))
# Convert hectare to grid x and y values
data[,"y"] <- as.numeric(substr(data[,"hectare"],1,2))
data[,"x"] <- apply(as.data.frame(substr(data[,"hectare"],3,3)),1,function(x){(1:9)[x==c("A","B","C","D","E","F","G","H","I")]})
# Plot showing domain of sightings
map <- get_map(location = c(min(data$long), min(data$lat), max(data$long), max(data$lat)), source="google", maptype="satellite", crop=FALSE)
dput(map, file = "myMaps")
map <- dget(file = "myMaps")
ggmap(map)+
geom_point(data=data[,c("lon","lat")])+
theme(plot.background = element_rect(fill = '#222222', colour = '#222222'),
panel.background = element_rect(fill = '#222222', colour = '#222222'))
# This illustrates that hectare is just used to reorient the values to be rectangular instead of angled
hect <- aggregate(data,by=list(data[,"hectare"]),mean)
n <- aggregate(rep(1,nrow(data)),by=list(data[,"hectare"]),sum)
hect <- cbind(n[,2],hect)
colnames(hect)[1] <- "n"
size <- 1+6*hect[,"n"]/max(hect[,"n"])
ggmap(map)+
geom_point(data=hect[,c("lon","lat")],aes(colour=factor(hect[,"Group.1"])),size=size)+
theme(legend.position="none",
plot.background = element_rect(fill = '#222222', colour = '#222222'),
panel.background = element_rect(fill = '#222222', colour = '#222222'))
# What date range was this data collected over?
date_to_text <- function(x){
# Thank goodness this all happend in the same month and didnt happen both after the 9th and before the 10th!
date <- paste(substring(x,5,9),substring(x,1,2),substring(x,3,4),sep="-")
day <- weekdays(as.Date(date))
#empties <- 10-as.data.frame(apply(as.data.frame(day),1,nchar))
#print(min(empties))
#apply(empties,1,rep," ")
return(paste(date,day,sep="\n"))
}
data[,"date_chr"] <- apply(data[,"date",drop=FALSE],1,date_to_text)
head(data[,"date_chr"])
## [1] "2018-10-14\nSunday" "2018-10-06\nSaturday" "2018-10-10\nWednesday"
## [4] "2018-10-18\nThursday" "2018-10-18\nThursday" "2018-10-19\nFriday"
samples_by_date <- aggregate(data[,"date_chr"],by=list(data[,"date_chr"]),function(x){length(x)})
# Default is c(5, 4, 4, 2) + 0.1
# Give extra space to show full date
par(mar=c(5.5, 8, 4, 2) + 0.1,mfrow=c(1,1),bg = '#222222')
counts <- table(data[,"date_chr"],by=data[,"shift"])
barplot(t(counts), main="Sightings by Date - AM vs PM",
xlab="Sightings" ,horiz=TRUE,las=2,cex.axis=1.25,cex.names=1.25,cex.lab=1.25,adj=0,cex.main=2,col = c("orange","#000055"))
models <- list()
# List of all boolean variables
variables <- c("Gray","Black","White","Cinnamon","MultiColor","above_ground","adult","foraging","running","chasing","climbing","eating","kuks","quaas","moans","tail_flags","tail_twitches","approaches","indifferent","runs_from")
plot_var <- function(variable){
# Predict variable using latitude and longitude
form <-as.formula(paste(variable,"~s(lon,lat)",sep=""))
fit <- mgcv::gam(formula=form,data=data[,c("lat","lon",variable)], family = binomial("logit"))
models <- append(models,fit)
pval <- paste(round(summary(fit)$s.pv,4)*100,"%",sep="")
rsq <- paste(round(summary(fit)$r.sq,4)*100,"%",sep="")
avg <- round(100*mean(na.omit(data[,variable])),digits = 1)
# Get predictions for corresponding hectares
preds <- predict(fit,newdata=hect,type="response")
# Get predictions to create an overall heatmap
miny <- min(data$lat)
maxy <- max(data$lat)
minx <- min(data$lon)
maxx <- max(data$lon)
grid <- expand.grid(seq(minx,maxx,(maxx-minx)/40),seq(miny,maxy,(maxy-miny)/60))
colnames(grid) <- c("lon","lat")
grid_preds <- predict(fit,newdata=grid,type="response")
grid <- cbind(grid,grid_preds)
#Plot Actual and predicted given latitude and longitude
# size of grids are always at least 1, and up to 6. The largest sample was max(hect[,"n"]) = 32
size <- 1+6*hect[,"n"]/max(hect[,"n"])
# Average by grid
raw_plot <-
ggmap(map)+
theme(axis.title.x=element_blank(),axis.text.x=element_blank(),axis.ticks.x=element_blank(),
axis.title.y=element_blank(),axis.text.y=element_blank(),axis.ticks.y=element_blank())+
geom_point(data=hect[,c("lon","lat")],aes(colour=hect[,variable]),size=size) +
scale_color_gradientn(colours = rainbow(5), limits=c(0,1),name=variable)
# Prediction by grid
pred_plot <-
ggmap(map)+
theme(axis.title.x=element_blank(),axis.text.x=element_blank(),axis.ticks.x=element_blank(),
axis.title.y=element_blank(),axis.text.y=element_blank(),axis.ticks.y=element_blank())+
geom_point(data=hect[,c("lon","lat")],aes(colour=preds),size=size) +
scale_color_gradientn(colours = rainbow(5), limits=c(0,1),breaks=c(round(min(preds),3),round(max(preds),3)),name="Predictions")+
geom_text(data=hect[preds==max(preds),,drop=FALSE][1,],aes(lon,lat,label=paste(100*round(max(preds),3),"%")),size=5)+
geom_text(data=hect[preds==min(preds),,drop=FALSE][1,],aes(lon,lat,label=paste(100*round(min(preds),3),"%")),size=5)
# Prediction by grid (make it more like a heat map)
pred_heatmap <-
ggmap(map)+
theme(axis.title.x=element_blank(),axis.text.x=element_blank(),axis.ticks.x=element_blank(),
axis.title.y=element_blank(),axis.text.y=element_blank(),axis.ticks.y=element_blank())+
geom_point(data=grid,aes(colour=grid_preds),size=5,shape=18,alpha=.4) +
scale_color_gradientn(colours = rainbow(5), limits=c(0,1),breaks=c(round(min(preds),3),round(max(preds),3)),name="HeatMap")+
geom_text(data=hect[preds==max(preds),,drop=FALSE][1,],aes(lon,lat,label=paste(100*round(max(preds),3),"%")),size=5)+
geom_text(data=hect[preds==min(preds),,drop=FALSE][1,],aes(lon,lat,label=paste(100*round(min(preds),3),"%")),size=5)
grid.arrange(raw_plot, pred_plot, pred_heatmap, ncol = 1,top=paste(toupper(variable),": \nAverage Value: ",avg,"%\nR-Squared: ",rsq,"\nP-Value: ", pval,sep=""))
par(mfrow=c(1,1))
}
par(bg = '#222222')
plot_var("Gray")
This variable descirbes if the squirrel had any black present, primary or otherwise.
par(bg = '#222222')
plot_var("Black")